home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
SGI Hot Mix 17
/
Hot Mix 17.iso
/
HM17_SGI
/
research
/
lib
/
obsolete
/
join.pro
< prev
next >
Wrap
Text File
|
1997-07-08
|
10KB
|
408 lines
; $Id: join.pro,v 1.2 1997/01/15 04:02:19 ali Exp $
;
; Copyright (c) 1991-1997, Research Systems Inc. All rights
; reserved. Unauthorized reproduction prohibited.
Function EuclidRule, Case1, RCases
; Dist returns the vector of Euclidean Distances from Case
; to the other cases in the array RCase
SC=Size(RCases)
C=SC(1)
if(SC(0) GT 1) THEN R=SC(2) ELSE R=1
M1= Case1#(Fltarr(R)+1)
M2=Fltarr(C)+1
M1=M1-RCases
if(R EQ 1) THEN Return,SQRT(Total(M1*M1)) $
else Return, transpose (SQRT(M2#(M1*M1)))
END
Function Distance1,Case1,RCases,N
; Determine appropriate rule for computing distances between
; cases and return the vector of distances between
; RCases and Case1.
SC=Size(RCases)
S=SC(1)
if(SC(0) GE 2) THEN R=SC(2) else R=1
Case N of
"%":BEGIN
X=Case1#replicate(1.,R) - RCases
A=where(X NE 0,count)
if(count NE 0) THEN X(A)=1
if(SC(0) GT 1) THEN return, $
transpose(replicate(1.,S)#X*1/S) $
ELSE return,Total(X)*1/S
END
"COR":BEGIN
V= [Correlate(Case1,RCases(*,0))]
for i=1,R-1 DO V=[V,Correlate(Case1,RCases(*,i))]
return,1-V
END
ELSE: return,EuclidRule(Case1,RCases)
ENDCASE
END
Function Normal1, Data,R,C
;Normal returns Data normalized by columns
Y= Data-Data#Replicate(1./R,R) #Replicate(1.,R)
std =sqrt(Y^2 # Replicate(1./(R-1),R))
D1=Fltarr(c,c)
for i=0,C-1 do $
if std(i) NE 0 then $
D1(i,i)=1./std(i) else d1(i,i)=0
return, D1#Y
end
Function FindRow,I,R
; D is a symmetric matrix stored linearly. FindRow
; reconstructs the ith row. R is the total number of rows.
Common JBlock,D,INDEX
case I of
R-1:Begin ;RETURN,(R-1)*(R-2)/2:(R+1)*(R-2)/2)
X=LINDGEN((R+1)*(R-2)/2 +1)
Return,X((R-1)*(R-2)/2:(R+1)*(R-2)/2)
END
0: BEGIN
T=LIndgen(R-1)
RETURN,(1+T)*T/2
END
ELSE: BEGIN
A1=I*(I-1)/2
A2=(I+1)*(I+2)/2 -1
T=LIndGen(R-I-1)
T=(T+1)*T/2
RETURN,[A1+LIndgen(I),A2 + I*LIndgen(R-I-1 )+T]
END
ENDCASE
END
Pro MinDist,R,DMin,IMin,I
; MinDist computes the minimum distance from case I to any
; other case using the symmetric distance matrix D. This
; distance and the corresponding row number are returned
; in DMin and
; IMIN respectively.
Common JBLock,D,INDEX
DMin(I)=min(D(FindRow(INDEX(I),R)))
if(!C LE INDEX(I)-1) THEN IMIN(I)=INDEX(!C) else $
IMIn(I)=INDEX(!C+1)
RETURN
END
Pro SetVal,V,I,R
Common JBlock,D,INDEX
D(FindRow(INDEX(I),R))=v
RETURN
END
FUNCTION AmalDist,I1,J1,R,pos,Am,cl,here
;AmalDist computes the distances from the cluster formed
; from I and J to the other cases and clusters as the
; minimum distance between members
Common JBlock,D,INDEX
I = INDEX(I1)
J = INDEX(J1)
X1=Fltarr(R)
Y1=Fltarr(R)
here = FindRow(I,R)
X=D(here)
case I of
R-1:X1=[X,1.e30]
0: X1=[1.e30,X]
ELSE:X1=[X(0:I-1),1.e30,X(I:R-2)]
ENDCASE
Y=D(FindRow(J,R))
case J of
R-1:Y1=[Y,1.e30]
0: Y1=[1.e30,y]
ELSE:Y1=[Y(0:J-1),1.e30,Y(J:R-2)]
ENDCASE
Ind=Where(Y1 EQ 1.e30)
Y1(Where(X1 EQ 1.e30))= 1.e30
X1(Ind)=1.e30
INDEX(cl) = I
INDEX(I) = cl
case Am of
"MAX" : V=X1>Y1
"MEAN": V=(pos(I)*X1+pos(J)*Y1)/(pos(I)+pos(J))
ELse : V=X1<Y1
ENDCASE
case I of
0: Return, V(1:*)
R-1: Return,V(0:R-2)
ELSE: Return,[V(0:I-1),V(I+1:*)]
ENDCASE
END
Pro Join, DataArray,pos,Imin,Sim,CaseName = CaseName, $
List_Name=LN,Norm=NR,Missing=M, Pair=pr, $
Distance=Dx, Amal=AM, NoPrint =NP, Width = LS
Common JBlock, D,Index
;+
; NAME:
; JOIN
;
; PURPOSE:
; To partition the cases represented by the rows in DataArray into
; nested clusters.
;
; CATEGORY:
; Statistics.
;
; CALLING SEQUENCE:
; JOIN, DataArray, pos, Imin, Sim
;
; INPUTS:
; DataArray: a (C,R) dimensioned array where R is the
; number of cases to be partitioned and C
; is the number of variables.
;
; KEYWORDS
; CASENAME: one dimensional array of R case names.
;
; NORM: Flag to signal whether to normalize the
; variable values in Data. Values normalized only if Norm=1.
;
; MISSING: Missing data value. If undefined, assume no missing data
;
; DISTANCE: Rule for computing case distances.
; Options are:
; 1. "%": Distance = percent of variable values that disagree
; between two cases.
; 2. "EUCLID": This is the default.
; 3. "COR": Distance between i and
; j = 1-rij, where rij= the correlation
; between i and j.
; 4. "OWN": DataArray= a symmetric a
; distance array supplied by the user. This array
; should be a 1-dim array of the elements in the
; distance array that are below the main diagonal.
;
; AMAL: Rule for computing cluster distances. Options are:
; 1. "MIN": distance = distance between closest members.
; 2. "MAX": distance = distance between farthest members.
; 3. "MEAN" : distance = average of distances between members.
;
; NOPRINT: A flag, if set, to suppress output to the screen.
;
; WIDTH: User supplied tree width in characters. The default is 60.
;
; OUTPUT:
; The tree of hierarchical clusters is printed to the screen.
;
; OPTIONAL OUTPUT PARAMETERS:
; Pos: One-dimensional array of cases in the in the tree of
; hierarchical clusters. Pos(i) = position of case i in tree.
;
; Imin: One-dimensional array specifying nesting of clusters.
; Imin(i) = smallest cluster properly containing cluster or
; case i.
;
; Sim: One-dimensional array of minimum distances.
; Sim(i) = the distance between the two clusters joined to
; form cluster i.
;
; RESTRICTIONS:
; None.
;
; COMMON BLOCKS:
; None.
;
; SIDE EFFECTS:
; None.
;
; PROCEDURE:
; Adapted from algorithm in Clustering Algorithms by
; Hartigan, Wiley Series in Probablity and Mathematical
; Statistics, Chapt.4. Function kmeans1 implements a
; function that given a partition P returns a
; partition P' in the same neighborhood with reduced
; in group error. Function is called repeatedly until
; it finds a fixed point or local minimum. Kmeans1
; recomputes cluster means after each reassignment.
; Procedure Kmeans successively finds partitions
; with the starting partition for K the final partition
; for K-1 with the case farthest from its cluster mean
; split off to form a new cluster.
;-
On_Error,2
SD=Size(DataArray)
if (N_ELements(LN) EQ 0) THEN unit = -1 $
else openw,unit,/Get,LN
if (SD(0) NE 2) THEN BEGIN
printf,unit, "Join- Data Matrix must be 2-dimensional"
goto,quit0
ENDIF
Data = DataArray
if (N_ELEMENTS(M) NE 0) THEN BEGIN
Data = listwise(Data, M)
SD = size(Data)
if N_ELEMENTS(Data) le 1 or SD(0) eq 1 THEN BEGIN
printf,unit,"join- Halting, too many missing data values."
goto,quit0
ENDIF
Endif
C= SD(1)
R=SD(2)
Inf= 1.e+30
if N_ELEMENTS(NR) NE 0 THEN $
Data=Normal1(Data,R,C) ;normalize data
if(N_Elements(Dx) EQ 0) THEN Dx="EUCLID" ;if not specified,
; euclidean
;distance computed
if(N_Elements(Am) EQ 0) THEN Am ="MIN"
;default amalgamation
; policy
if(Dx EQ 'OWN') THEN D=Data else Begin
;user specifies distances
;between cases
V=Distance1(Data(*,1),Data(*,0),Dx)
;Distance between first two cases print,"V=",V
D=[V] ;initialize symmetric Distance
;matrix D
;Note that D is 1-dimensional
for i=2l,R-1 DO BEGIN ;Construct the rest of D
V=Distance1(Data(*,i),Data(*,0:I-1),Dx )
D=[D,V]
ENDFOR
ENDELSE
Index = Findgen(2*R) ;initialize indices
Sim=Fltarr(2*R) ;measure of closeness of cases
;in clusters
DMin=Fltarr(2*R) ;
;DDMin=DMIN
IMin=Intarr(2*R)
for I=0l,R-1 DO BEGin
MinDist,R,DMin,IMin,I ;Compute min distance between
ENDFOR ; cases and store distance in
;DMIN(i) and nearest case to i
; in IMIN(I)
RN=R
Pos=[Intarr(RN) +1,Intarr(RN-1)]
for i= RN,2*RN-2 DO BEGIN ; Amalgamate clusters
DM=Min(DMin(0:R-1),MI) ; Find closest clusters
Sim(I)=DM
MJ=IMin(MI)
DMin(MI)= inf & IMin(MI)= I & DMin(MJ)=inf & IMin(MJ)=I
; and combine
V= AmalDist(MI,MJ,RN,pos,Am,i,here)
D(here) = V
R=R+1
if (i NE (2*RN-2)) THEN BEGIN
MinDist,RN,DMin,IMin,I
Pos(i) = Pos(MJ)+Pos(MI)
Ind=where((DMin(0:R-1) GT V) AND $
(DMIN(0:R-1) NE inf) ,count)
;Compare distance to new cluster with min
; and update
if(count NE 0) THEN BEGIN
DMin(Ind) = V(Ind)
IMin(Ind) = I
ENDIF
SetVal,inf,MJ,RN
for J= 0l,R-1 DO BEGIN ;Recompute min distance where
;IMin = one of the amalgamated
;clusters
if(IMin(j) EQ MJ) OR ( IMin(j) EQ MI) THEN BEGIN
if(DMin(j) NE inf) THEN MinDist,RN,DMin,IMin,J
ENDIF
ENDFOR
ENDIF
ENDFOR ; Amalgamation completed
Pos=[Intarr(RN) +1,Intarr(RN-1)]
for i=0l,2*RN-3 DO Pos(IMIN(I))=Pos(IMIN(I))+Pos(I)
;Pos(I)= # of objects in Ith cluster
for i=0l,2*RN-3 DO BEGIN ;Pos(i) = position of i in tree
j=2*RN-i-3
temp=Pos(j)
Pos(j)= Pos(IMin(j))
Pos(IMin(j))=Pos(IMin(j))-temp
ENDFOR
pos=pos(0:RN-1)-1
if( N_Elements(NP) EQ 0) THEN maketree,pos,imin,SIM,unit, $
CaseName=CaseName,Width = LS
quit0:
if(unit NE -1) THEN Free_Lun,unit
RETURN
END